More Tidymodels

Lecture 23

Dr. Colin Rundel

Hotels Data

Original data from Antonio, Almeida, and Nunes (2019), Data dictionary

# Read the hotels case-study data and convert every character column to a
# factor so downstream recipes/models treat them as categorical.
hotels = read_csv(
  'https://tidymodels.org/start/case-study/hotels.csv'
) |>
  mutate(
    across(where(is.character), as.factor)
  )

The data

glimpse(hotels)
Rows: 50,000
Columns: 23
$ hotel                          <fct> City_Hotel, City_Hotel, Resort_Hotel, Resort_Hotel, Re…
$ lead_time                      <dbl> 217, 2, 95, 143, 136, 67, 47, 56, 80, 6, 130, 27, 16, …
$ stays_in_weekend_nights        <dbl> 1, 0, 2, 2, 1, 2, 0, 0, 0, 2, 1, 0, 1, 0, 1, 1, 1, 4, …
$ stays_in_week_nights           <dbl> 3, 1, 5, 6, 4, 2, 2, 3, 4, 2, 2, 1, 2, 2, 1, 1, 2, 7, …
$ adults                         <dbl> 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, …
$ children                       <fct> none, none, none, none, none, none, children, children…
$ meal                           <fct> BB, BB, BB, HB, HB, SC, BB, BB, BB, BB, BB, BB, BB, BB…
$ country                        <fct> DEU, PRT, GBR, ROU, PRT, GBR, ESP, ESP, FRA, FRA, FRA,…
$ market_segment                 <fct> Offline_TA/TO, Direct, Online_TA, Online_TA, Direct, O…
$ distribution_channel           <fct> TA/TO, Direct, TA/TO, TA/TO, Direct, TA/TO, Direct, TA…
$ is_repeated_guest              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ previous_cancellations         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ previous_bookings_not_canceled <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ reserved_room_type             <fct> A, D, A, A, F, A, C, B, D, A, A, D, A, D, A, A, D, A, …
$ assigned_room_type             <fct> A, K, A, A, F, A, C, A, D, A, D, D, A, D, A, A, D, A, …
$ booking_changes                <dbl> 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, …
$ deposit_type                   <fct> No_Deposit, No_Deposit, No_Deposit, No_Deposit, No_Dep…
$ days_in_waiting_list           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 236, 0, 0, 0…
$ customer_type                  <fct> Transient-Party, Transient, Transient, Transient, Tran…
$ average_daily_rate             <dbl> 80.75, 170.00, 8.00, 81.00, 157.60, 49.09, 289.00, 82.…
$ required_car_parking_spaces    <fct> none, none, none, none, none, none, none, none, none, …
$ total_of_special_requests      <dbl> 1, 3, 2, 1, 4, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, …
$ arrival_date                   <date> 2016-09-01, 2017-08-25, 2016-11-19, 2016-04-26, 2016-…

The model

Our goal is to develop a predictive model that is able to predict whether a booking will include children or not based on the other characteristics of the booking.

# Class balance of the outcome: only ~8% of bookings include children,
# so the outcome classes are quite imbalanced.
hotels |>
  count(children) |>
  mutate(prop = n/sum(n))
# A tibble: 2 × 3
  children     n   prop
  <fct>    <int>  <dbl>
1 children  4038 0.0808
2 none     45962 0.919 

Creating the test/train split

# Seed for reproducibility of the random split
set.seed(123)

# Default 75/25 train/test split (see 37,500 / 12,500 row counts below),
# stratified on the outcome so both sets keep ~8% "children" bookings.
splits = initial_split(
  hotels, strata = children
)

hotel_train = training(splits)
hotel_test = testing(splits)
dim(hotel_train)
[1] 37500    23
dim(hotel_test)
[1] 12500    23
hotel_train |>
  count(children) |>
  mutate(prop = n/sum(n))
# A tibble: 2 × 3
  children     n   prop
  <fct>    <int>  <dbl>
1 children  3027 0.0807
2 none     34473 0.919 
hotel_test |>
  count(children) |>
  mutate(prop = n/sum(n))
# A tibble: 2 × 3
  children     n   prop
  <fct>    <int>  <dbl>
1 children  1011 0.0809
2 none     11489 0.919 

Logistic Regression model

show_engines("logistic_reg")
# A tibble: 7 × 2
  engine    mode          
  <chr>     <chr>         
1 glm       classification
2 glmnet    classification
3 LiblineaR classification
4 spark     classification
5 keras     classification
6 stan      classification
7 brulee    classification
# Plain logistic regression via stats::glm — no tuning parameters
lr_model = logistic_reg() |>
  set_engine("glm")
# Show the underlying glm() call that parsnip will generate at fit time
translate(lr_model)
Logistic Regression Model Specification (classification)

Computational engine: glm 

Model fit template:
stats::glm(formula = missing_arg(), data = missing_arg(), weights = missing_arg(), 
    family = stats::binomial)

Recipe

# Holidays to encode as indicator columns via step_holiday()
holidays = c("AllSouls", "AshWednesday", "ChristmasEve", "Easter", 
              "ChristmasDay", "GoodFriday", "NewYearsDay", "PalmSunday")

# Preprocessing for the glm model:
#  - expand arrival_date into date-based features and holiday indicators,
#    then drop the raw date column
#  - drop country entirely
#  - dummy-code all remaining factor predictors
#  - remove any zero-variance columns produced along the way
lr_recipe = recipe(children ~ ., data = hotel_train) |> 
  step_date(arrival_date) |> 
  step_holiday(arrival_date, holidays = holidays) |> 
  step_rm(arrival_date) |> 
  step_rm(country) |>
  step_dummy(all_nominal_predictors()) |> 
  step_zv(all_predictors())

lr_recipe

# Estimate (prep) the recipe and materialize (bake) the processed
# training data to inspect the resulting feature columns
lr_recipe |>
  prep() |>
  bake(new_data = hotel_train)
# A tibble: 37,500 × 76
   lead_time stays_in_weekend_nights stays_in_week_nights adults is_repeated_guest
       <dbl>                   <dbl>                <dbl>  <dbl>             <dbl>
 1         2                       0                    1      2                 0
 2        95                       2                    5      2                 0
 3        67                       2                    2      2                 0
 4        47                       0                    2      2                 0
 5        56                       0                    3      0                 0
 6         6                       2                    2      2                 0
 7       130                       1                    2      2                 0
 8        27                       0                    1      1                 0
 9        46                       0                    2      2                 0
10       423                       1                    1      2                 0
# ℹ 37,490 more rows
# ℹ 71 more variables: previous_cancellations <dbl>, previous_bookings_not_canceled <dbl>,
#   booking_changes <dbl>, days_in_waiting_list <dbl>, average_daily_rate <dbl>,
#   total_of_special_requests <dbl>, children <fct>, arrival_date_year <int>,
#   arrival_date_AllSouls <int>, arrival_date_AshWednesday <int>, arrival_date_ChristmasEve <int>,
#   arrival_date_Easter <int>, arrival_date_ChristmasDay <int>, arrival_date_GoodFriday <int>,
#   arrival_date_NewYearsDay <int>, arrival_date_PalmSunday <int>, hotel_Resort_Hotel <dbl>, …

Workflow

# Bundle the preprocessing recipe and the glm model spec into a single
# workflow object that can be fit / predicted as one unit
( lr_work = workflow() |>
    add_model(lr_model) |>
    add_recipe(lr_recipe) 
)
══ Workflow ════════════════════════════════════════════════
Preprocessor: Recipe
Model: logistic_reg()

── Preprocessor ────────────────────────────────────────────
6 Recipe Steps

• step_date()
• step_holiday()
• step_rm()
• step_rm()
• step_dummy()
• step_zv()

── Model ───────────────────────────────────────────────────
Logistic Regression Model Specification (classification)

Computational engine: glm 

Fit

# Fit the workflow (prep the recipe + fit the glm) on the training data
( lr_fit = lr_work |>
    fit(data = hotel_train) )
══ Workflow [trained] ══════════════════════════════════════
Preprocessor: Recipe
Model: logistic_reg()

── Preprocessor ────────────────────────────────────────────
6 Recipe Steps

• step_date()
• step_holiday()
• step_rm()
• step_rm()
• step_dummy()
• step_zv()

── Model ───────────────────────────────────────────────────

Call:  stats::glm(formula = ..y ~ ., family = stats::binomial, data = data)

Coefficients:
                        (Intercept)  
                         -2.543e+02  
                          lead_time  
                         -1.287e-03  
            stays_in_weekend_nights  
                          5.231e-02  
               stays_in_week_nights  
                         -3.433e-02  
                             adults  
                          7.328e-01  
                  is_repeated_guest  
                          3.962e-01  
             previous_cancellations  
                          2.147e-01  
     previous_bookings_not_canceled  
                          3.728e-01  
                    booking_changes  
                         -2.396e-01  
               days_in_waiting_list  
                          6.415e-03  
                 average_daily_rate  
                         -1.049e-02  
          total_of_special_requests  
                         -4.936e-01  
                  arrival_date_year  
                          1.344e-01  
              arrival_date_AllSouls  
                          1.006e+00  
          arrival_date_AshWednesday  
                          2.019e-01  
          arrival_date_ChristmasEve  
                          5.328e-01  
                arrival_date_Easter  
                         -9.749e-01  
          arrival_date_ChristmasDay  
                         -6.875e-01  
            arrival_date_GoodFriday  
                         -1.593e-01  
           arrival_date_NewYearsDay  
                         -1.185e+00  
            arrival_date_PalmSunday  
                         -6.243e-01  
                 hotel_Resort_Hotel  
                          9.581e-01  
                            meal_FB  
                         -6.348e-01  

...
and 110 more lines.

Logistic regression predictions

# Within-sample predictions: augment() appends .pred_class and the
# two class probability columns to the training data
( lr_train_perf = lr_fit |>
    augment(new_data = hotel_train) |>
    select(children, starts_with(".pred")) )
# A tibble: 37,500 × 4
   children .pred_class .pred_children .pred_none
   <fct>    <fct>                <dbl>      <dbl>
 1 none     none                0.0861     0.914 
 2 none     none                0.0178     0.982 
 3 none     none                0.0101     0.990 
 4 children children            0.931      0.0693
 5 children none                0.473      0.527 
 6 children none                0.144      0.856 
 7 none     none                0.0710     0.929 
 8 none     none                0.0596     0.940 
 9 none     none                0.0252     0.975 
10 none     none                0.0735     0.926 
# ℹ 37,490 more rows
# Out-of-sample predictions: same augment() pattern on the held-out test set
( lr_test_perf = lr_fit |>
    augment(new_data = hotel_test) |>
    select(children, starts_with(".pred")) )
# A tibble: 12,500 × 4
   children .pred_class .pred_children .pred_none
   <fct>    <fct>                <dbl>      <dbl>
 1 none     none              0.00854       0.991
 2 none     none              0.0202        0.980
 3 none     children          0.757         0.243
 4 none     none              0.0373        0.963
 5 none     none              0.000975      0.999
 6 none     none              0.000474      1.00 
 7 none     none              0.0736        0.926
 8 none     none              0.0748        0.925
 9 none     none              0.0532        0.947
10 none     none              0.0794        0.921
# ℹ 12,490 more rows

Performance metrics (within-sample)

conf_mat(lr_train_perf, children, .pred_class)
          Truth
Prediction children  none
  children     1075   420
  none         1952 34053
accuracy(lr_train_perf, children, .pred_class)
# A tibble: 1 × 3
  .metric  .estimator .estimate
  <chr>    <chr>          <dbl>
1 accuracy binary         0.937
precision(lr_train_perf, children, .pred_class)
# A tibble: 1 × 3
  .metric   .estimator .estimate
  <chr>     <chr>          <dbl>
1 precision binary         0.719
yardstick::roc_curve(
  lr_train_perf,
  children,
  .pred_children
) |>
  autoplot()

roc_auc(lr_train_perf, children, .pred_children)
# A tibble: 1 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 roc_auc binary         0.881

Performance metrics (out-of-sample)

conf_mat(lr_test_perf, children, .pred_class)
          Truth
Prediction children  none
  children      359   137
  none          652 11352
accuracy(lr_test_perf, children, .pred_class)
# A tibble: 1 × 3
  .metric  .estimator .estimate
  <chr>    <chr>          <dbl>
1 accuracy binary         0.937
precision(lr_test_perf, children, .pred_class)
# A tibble: 1 × 3
  .metric   .estimator .estimate
  <chr>     <chr>          <dbl>
1 precision binary         0.724
  yardstick::roc_curve(
    lr_test_perf,
    children,
    .pred_children
  ) |>
  autoplot()

roc_auc(lr_test_perf, children, .pred_children)
# A tibble: 1 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 roc_auc binary         0.864

Combining ROC curves

# Compute the ROC curve for each prediction set and tag it with a label
# so the two curves can be told apart after stacking.
lr_train_roc = yardstick::roc_curve(lr_train_perf, children, .pred_children) |>
  mutate(name = "logistic - train")

lr_test_roc = yardstick::roc_curve(lr_test_perf, children, .pred_children) |>
  mutate(name = "logistic - test")

# Stack the two curves and draw them on common axes; the dotted diagonal
# marks the performance of a no-skill classifier.
lr_train_roc |>
  bind_rows(lr_test_roc) |>
  ggplot(aes(x = 1 - specificity, y = sensitivity, col = name)) +
    geom_path(lwd = 1.5, alpha = 0.8) +
    geom_abline(lty = 3) +
    coord_equal()

Lasso

Lasso Model

For this we will be using the glmnet package which supports fitting lasso, ridge and elastic net models.

# Lasso: mixture = 1 selects the pure L1 penalty; penalty (lambda) is
# left as tune() to be chosen via cross-validation
lasso_model = logistic_reg(penalty = tune(), mixture = 1) |>
  set_engine("glmnet")
  • mixture determines the type of model fit

    • 1 for Lasso,

    • 0 for Ridge,

    • other for elastic net.

  • penalty is \(\lambda\) in the lasso model, scales the penalty for coefficient size.

# List the parameters marked for tuning (here just `penalty`).
# NOTE(review): parameters() on a model spec is deprecated in recent
# dials/tune releases in favor of extract_parameter_set_dials() —
# confirm against the package versions in use.
lasso_model |> 
  parameters()
Collection of 1 parameters for tuning

 identifier    type    object
    penalty penalty nparam[+]
lasso_model |>
  translate()
Logistic Regression Model Specification (classification)

Main Arguments:
  penalty = tune()
  mixture = 1

Computational engine: glmnet 

Model fit template:
glmnet::glmnet(x = missing_arg(), y = missing_arg(), weights = missing_arg(), 
    alpha = 1, family = "binomial")

Lasso Recipe

Lasso (and Ridge) models are sensitive to the scale of the model features, and so a standard approach is to normalize all features before fitting the model.

# Reuse the glm recipe and additionally normalize all predictors, since
# lasso/ridge penalties are sensitive to feature scale
lasso_recipe = lr_recipe |>
  step_normalize(all_predictors())
# Inspect the processed (now centered/scaled) training data
lasso_recipe |>
  prep() |>
  bake(new_data = hotel_train)
# A tibble: 37,500 × 76
   lead_time stays_in_weekend_nights stays_in_week_nights
       <dbl>                   <dbl>                <dbl>
 1    -0.858                 -0.938                -0.767
 2     0.160                  1.09                  1.32 
 3    -0.146                  1.09                 -0.245
 4    -0.365                 -0.938                -0.245
 5    -0.267                 -0.938                 0.278
 6    -0.814                  1.09                 -0.245
 7     0.544                  0.0735               -0.245
 8    -0.584                 -0.938                -0.767
 9    -0.376                 -0.938                -0.245
10     3.75                   0.0735               -0.767
# ℹ 37,490 more rows
# ℹ 73 more variables: adults <dbl>,
#   is_repeated_guest <dbl>, previous_cancellations <dbl>,
#   previous_bookings_not_canceled <dbl>,
#   booking_changes <dbl>, days_in_waiting_list <dbl>,
#   average_daily_rate <dbl>,
#   total_of_special_requests <dbl>, children <fct>, …

Lasso workflow

# Workflow pairing the normalized recipe with the tunable lasso model
( lasso_work = workflow() |>
    add_model(lasso_model) |>
    add_recipe(lasso_recipe)
)
══ Workflow ════════════════════════════════════════════════
Preprocessor: Recipe
Model: logistic_reg()

── Preprocessor ────────────────────────────────────────────
7 Recipe Steps

• step_date()
• step_holiday()
• step_rm()
• step_rm()
• step_dummy()
• step_zv()
• step_normalize()

── Model ───────────────────────────────────────────────────
Logistic Regression Model Specification (classification)

Main Arguments:
  penalty = tune()
  mixture = 1

Computational engine: glmnet 

v-folds for hyperparameter tuning

# 5-fold cross-validation folds for tuning, stratified on the outcome
( hotel_vf = rsample::vfold_cv(hotel_train, v=5, strata = children) )
#  5-fold cross-validation using stratification 
# A tibble: 5 × 2
  splits               id   
  <list>               <chr>
1 <split [30000/7500]> Fold1
2 <split [30000/7500]> Fold2
3 <split [30000/7500]> Fold3
4 <split [30000/7500]> Fold4
5 <split [30000/7500]> Fold5

Results

# Cross-validated AUC for each candidate penalty value
# (lasso_grid comes from a tune_grid() call not shown on these slides)
lasso_grid |>
  collect_metrics()
# A tibble: 10 × 7
    penalty .metric .estimator  mean     n std_err
      <dbl> <chr>   <chr>      <dbl> <int>   <dbl>
 1 0.0001   roc_auc binary     0.877     5 0.00318
 2 0.000215 roc_auc binary     0.877     5 0.00316
 3 0.000464 roc_auc binary     0.877     5 0.00314
 4 0.001    roc_auc binary     0.877     5 0.00304
 5 0.00215  roc_auc binary     0.877     5 0.00263
 6 0.00464  roc_auc binary     0.870     5 0.00253
 7 0.01     roc_auc binary     0.853     5 0.00249
 8 0.0215   roc_auc binary     0.824     5 0.00424
 9 0.0464   roc_auc binary     0.797     5 0.00400
10 0.1      roc_auc binary     0.5       5 0      
# ℹ 1 more variable: .config <chr>
# AUC vs penalty on a log scale: performance is flat for small penalties,
# then collapses (down to 0.5 at penalty = 0.1, per the table above)
lasso_grid |> 
  collect_metrics() |> 
  ggplot(aes(x = penalty, y = mean)) + 
    geom_point() + 
    geom_line() + 
    ylab("Area under the ROC Curve") +
    scale_x_log10(labels = scales::label_number())

“Best” models

# Rank candidates by cross-validated AUC. The metric must be passed by
# name: positional metric strings are deprecated in recent tune releases,
# and every other show_best() call in this document already uses `metric =`.
lasso_grid |>
  show_best(metric = "roc_auc", n = 10)
# A tibble: 10 × 7
    penalty .metric .estimator  mean     n std_err .config  
      <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>    
 1 0.001    roc_auc binary     0.877     5 0.00304 Preproce…
 2 0.00215  roc_auc binary     0.877     5 0.00263 Preproce…
 3 0.000464 roc_auc binary     0.877     5 0.00314 Preproce…
 4 0.000215 roc_auc binary     0.877     5 0.00316 Preproce…
 5 0.0001   roc_auc binary     0.877     5 0.00318 Preproce…
 6 0.00464  roc_auc binary     0.870     5 0.00253 Preproce…
 7 0.01     roc_auc binary     0.853     5 0.00249 Preproce…
 8 0.0215   roc_auc binary     0.824     5 0.00424 Preproce…
 9 0.0464   roc_auc binary     0.797     5 0.00400 Preproce…
10 0.1      roc_auc binary     0.5       5 0       Preproce…

“Best” model

# Pick the "best" penalty: round AUC to 2 digits so near-ties count as
# equal, then prefer the largest penalty (sparsest model) among the ties
( lasso_best = lasso_grid |>
    collect_metrics() |>
    mutate(mean = round(mean, 2)) |>
    arrange(desc(mean), desc(penalty)) |>
    slice(1) )
# A tibble: 1 × 7
  penalty .metric .estimator  mean     n std_err .config    
    <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>      
1 0.00215 roc_auc binary      0.88     5 0.00263 Preprocess…

Extracting predictions

Since we used control_grid(save_pred = TRUE) with tune_grid() we can recover the predictions for the out-of-sample values for each fold:

# Out-of-fold predictions for the selected penalty (available because
# tune_grid() was run with control_grid(save_pred = TRUE))
( lasso_train_perf = lasso_grid |>
    collect_predictions(parameters = lasso_best) )
# A tibble: 37,500 × 7
   id    .pred_children .pred_none  .row penalty children
   <chr>          <dbl>      <dbl> <int>   <dbl> <fct>   
 1 Fold1         0.366       0.634     5 0.00215 children
 2 Fold1         0.144       0.856     6 0.00215 children
 3 Fold1         0.0542      0.946    19 0.00215 none    
 4 Fold1         0.0266      0.973    21 0.00215 none    
 5 Fold1         0.106       0.894    22 0.00215 children
 6 Fold1         0.0286      0.971    23 0.00215 none    
 7 Fold1         0.0205      0.980    30 0.00215 none    
 8 Fold1         0.0192      0.981    31 0.00215 none    
 9 Fold1         0.0431      0.957    32 0.00215 none    
10 Fold1         0.0532      0.947    35 0.00215 none    
# ℹ 37,490 more rows
# ℹ 1 more variable: .config <chr>

# ROC curve and AUC computed from the cross-validated (out-of-fold) predictions
lasso_train_perf |>
  roc_curve(children, .pred_children) |>
  autoplot()

lasso_train_perf |>
  roc_auc(children, .pred_children)
# A tibble: 1 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 roc_auc binary         0.877

Re-fitting

Typically with a tuned model we will refit using the complete training data and the “best” parameter value(s),

# Swap the tune() placeholder model for one with the selected penalty,
# leaving the recipe in the workflow unchanged.
lasso_work_tuned = lasso_work |>
  update_model(
    logistic_reg(mixture = 1, penalty = lasso_best$penalty) |>
      set_engine("glmnet")
  )

# Refit on the full training set with the chosen hyperparameter.
lasso_fit = fit(lasso_work_tuned, data = hotel_train)

Test Performance (out-of-sample)

# Test-set predictions from the refit lasso model
lasso_test_perf = lasso_fit |>
  augment(new_data = hotel_test) |>
  select(children, starts_with(".pred"))
conf_mat(lasso_test_perf, children, .pred_class)
          Truth
Prediction children  none
  children      330   109
  none          681 11380
accuracy(lasso_test_perf, children, .pred_class)
# A tibble: 1 × 3
  .metric  .estimator .estimate
  <chr>    <chr>          <dbl>
1 accuracy binary         0.937
precision(lasso_test_perf, children, .pred_class)
# A tibble: 1 × 3
  .metric   .estimator .estimate
  <chr>     <chr>          <dbl>
1 precision binary         0.752
# Test-set ROC curve, labelled for the later cross-model comparison
lasso_roc = yardstick::roc_curve(
    lasso_test_perf,
    children,
    .pred_children
  ) |>
  mutate(name = "lasso - test")
lasso_roc |>
  autoplot()

roc_auc(lasso_test_perf, children, .pred_children)
# A tibble: 1 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 roc_auc binary         0.866

Comparing models

Decision tree

Decision tree models

show_engines("decision_tree")
# A tibble: 5 × 2
  engine mode          
  <chr>  <chr>         
1 rpart  classification
2 rpart  regression    
3 C5.0   classification
4 spark  classification
5 spark  regression    
# rpart decision tree with all three complexity controls marked for tuning
dt_model = decision_tree(
  tree_depth = tune(), 
  min_n = tune(),
  cost_complexity = tune()
) |> 
  set_engine("rpart") |> 
  set_mode("classification")

Recipe & workflow

We skip dummy coding in the recipe as it is not needed by rpart,

# Same date/holiday preprocessing as before, but without step_dummy():
# rpart handles factor predictors natively
dt_recipe = recipe(children ~ ., data = hotel_train) |> 
  step_date(arrival_date) |> 
  step_holiday(arrival_date, holidays = holidays) |> 
  step_rm(arrival_date) |>
  step_rm(country)
# Workflow pairing the tree recipe with the tunable rpart model
dt_work = workflow() |> 
  add_model(dt_model) |> 
  add_recipe(dt_recipe)

Tuning

# Regular 3x3x3 grid over the three tuning parameters (27 candidates)
( dt_grid = grid_regular(
    cost_complexity(), 
    tree_depth(), 
    min_n(), 
    levels = 3
) )
# A tibble: 27 × 3
   cost_complexity tree_depth min_n
             <dbl>      <int> <int>
 1    0.0000000001          1     2
 2    0.00000316            1     2
 3    0.1                   1     2
 4    0.0000000001          8     2
 5    0.00000316            8     2
 6    0.1                   8     2
 7    0.0000000001         15     2
 8    0.00000316           15     2
 9    0.1                  15     2
10    0.0000000001          1    21
# ℹ 17 more rows
# Parallelize tuning across 8 background R sessions
doFuture::registerDoFuture()
future::plan(future::multisession, workers = 8)
# Evaluate all 27 candidates on the 5 CV folds, saving out-of-fold
# predictions so ROC curves can be recovered later
dt_tune = dt_work |> 
  tune_grid(
    hotel_vf,
    grid = dt_grid,
    control = control_grid(save_pred = TRUE),
    metrics = metric_set(roc_auc)
  )

Tuning results

# Cross-validated AUC for every candidate, best first
dt_tune |>
  collect_metrics() |>
  arrange(desc(mean))
# A tibble: 27 × 9
   cost_complexity tree_depth min_n .metric .estimator  mean
             <dbl>      <int> <int> <chr>   <chr>      <dbl>
 1    0.0000000001         15    21 roc_auc binary     0.867
 2    0.00000316           15    21 roc_auc binary     0.867
 3    0.0000000001         15    40 roc_auc binary     0.863
 4    0.00000316           15    40 roc_auc binary     0.863
 5    0.0000000001          8    21 roc_auc binary     0.848
 6    0.00000316            8    21 roc_auc binary     0.848
 7    0.0000000001          8    40 roc_auc binary     0.846
 8    0.00000316            8    40 roc_auc binary     0.846
 9    0.0000000001          8     2 roc_auc binary     0.843
10    0.00000316            8     2 roc_auc binary     0.843
# ℹ 17 more rows
# ℹ 3 more variables: n <int>, std_err <dbl>, .config <chr>

“Best” parameters

# Top 5 candidates by cross-validated AUC
dt_tune |> 
  show_best(metric = "roc_auc")
# A tibble: 5 × 9
  cost_complexity tree_depth min_n .metric
            <dbl>      <int> <int> <chr>  
1    0.0000000001         15    21 roc_auc
2    0.00000316           15    21 roc_auc
3    0.0000000001         15    40 roc_auc
4    0.00000316           15    40 roc_auc
5    0.0000000001          8    21 roc_auc
# ℹ 5 more variables: .estimator <chr>,
#   mean <dbl>, n <int>, std_err <dbl>,
#   .config <chr>
autoplot(dt_tune)

Refitting

# Single best parameter combination by cross-validated AUC
(dt_best = dt_tune |>
  select_best(metric = "roc_auc"))
# A tibble: 1 × 4
  cost_complexity tree_depth min_n .config              
            <dbl>      <int> <int> <chr>                
1    0.0000000001         15    21 Preprocessor1_Model16
# Replace the tune() placeholders with the selected values while keeping
# the recipe in the workflow unchanged.
dt_work_tuned = dt_work |>
  update_model(
    decision_tree(
      cost_complexity = dt_best$cost_complexity,
      tree_depth = dt_best$tree_depth,
      min_n = dt_best$min_n
    ) |>
      set_mode("classification") |>
      set_engine("rpart")
  )

# Refit on the full training set with the chosen hyperparameters.
dt_fit = fit(dt_work_tuned, data = hotel_train)

Model extraction

# Pull the underlying rpart object out of the workflow and plot the tree
dt_fit |> 
  hardhat::extract_fit_engine() |> 
  plot()

Test Performance (out-of-sample)

# Test-set predictions from the refit decision tree
dt_test_perf = dt_fit |>
  augment(new_data = hotel_test) |>
  select(children, starts_with(".pred"))
conf_mat(dt_test_perf, children, .pred_class)
          Truth
Prediction children  none
  children      444   270
  none          567 11219
accuracy(dt_test_perf, children, .pred_class)
# A tibble: 1 × 3
  .metric  .estimator .estimate
  <chr>    <chr>          <dbl>
1 accuracy binary         0.933
precision(dt_test_perf, children, .pred_class)
# A tibble: 1 × 3
  .metric   .estimator .estimate
  <chr>     <chr>          <dbl>
1 precision binary         0.622
# Test-set ROC curve, labelled for the later cross-model comparison
dt_roc = yardstick::roc_curve(
    dt_test_perf,
    children,
    .pred_children
  ) |>
  mutate(name = "DT - test")
dt_roc |>
  autoplot()

roc_auc(dt_test_perf, children, .pred_children)
# A tibble: 1 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 roc_auc binary         0.858

Comparing models

Random Forest

Random forest models

show_engines("rand_forest")
# A tibble: 6 × 2
  engine       mode          
  <chr>        <chr>         
1 ranger       classification
2 ranger       regression    
3 randomForest classification
4 randomForest regression    
5 spark        classification
6 spark        regression    
# ranger random forest (100 trees, 8 threads); mtry and min_n are tuned
rf_model = rand_forest(mtry = tune(), min_n = tune(), trees = 100) |> 
  set_engine("ranger", num.threads = 8) |> 
  set_mode("classification")

Recipe & workflow

We skip dummy coding in the recipe as it is not needed by ranger,

# Same preprocessing as the decision tree: no dummy coding needed for ranger
rf_recipe = recipe(children ~ ., data = hotel_train) |> 
  step_date(arrival_date) |> 
  step_holiday(arrival_date, holidays = holidays) |> 
  step_rm(arrival_date) |>
  step_rm(country)
# Workflow pairing the forest recipe with the tunable ranger model
rf_work = workflow() |> 
  add_model(rf_model) |> 
  add_recipe(rf_recipe)

“Best” parameters

# Top 5 candidates by cross-validated AUC
# (rf_tune comes from a tune_grid() call not shown on these slides)
rf_tune |> 
  show_best(metric = "roc_auc")
# A tibble: 5 × 8
   mtry min_n .metric .estimator  mean     n
  <int> <int> <chr>   <chr>      <dbl> <int>
1     5     3 roc_auc binary     0.918     5
2     9    31 roc_auc binary     0.916     5
3    10    21 roc_auc binary     0.915     5
4    15    23 roc_auc binary     0.912     5
5    18    38 roc_auc binary     0.911     5
# ℹ 2 more variables: std_err <dbl>,
#   .config <chr>
autoplot(rf_tune)

Refitting

# Single best parameter combination by cross-validated AUC
(rf_best = rf_tune |>
  select_best(metric = "roc_auc"))
# A tibble: 1 × 3
   mtry min_n .config              
  <int> <int> <chr>                
1     5     3 Preprocessor1_Model03
# Replace the tune() placeholders with the selected values while keeping
# the recipe in the workflow unchanged.
rf_work_tuned = rf_work |>
  update_model(
    rand_forest(
      trees = 100,
      mtry = rf_best$mtry,
      min_n = rf_best$min_n
    ) |>
      set_mode("classification") |>
      set_engine("ranger", num.threads = 8)
  )

# Refit on the full training set with the chosen hyperparameters.
rf_fit = fit(rf_work_tuned, data = hotel_train)

Test Performance (out-of-sample)

# Test-set predictions from the refit random forest
rf_test_perf = rf_fit |>
  augment(new_data = hotel_test) |>
  select(children, starts_with(".pred"))
conf_mat(rf_test_perf, children, .pred_class)
          Truth
Prediction children  none
  children      388    70
  none          623 11419
accuracy(rf_test_perf, children, .pred_class)
# A tibble: 1 × 3
  .metric  .estimator .estimate
  <chr>    <chr>          <dbl>
1 accuracy binary         0.945
precision(rf_test_perf, children, .pred_class)
# A tibble: 1 × 3
  .metric   .estimator .estimate
  <chr>     <chr>          <dbl>
1 precision binary         0.847
# Test-set ROC curve, labelled for the later cross-model comparison
rf_roc = yardstick::roc_curve(
    rf_test_perf,
    children,
    .pred_children
  ) |>
  mutate(name = "RF - test")
rf_roc |>
  autoplot()

roc_auc(rf_test_perf, children, .pred_children)
# A tibble: 1 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 roc_auc binary         0.923

Comparing models